package darwInvest.news.utility;

/**
 * Strips HTML markup from a document, leaving only its text content.
 * Embedded CSS ({@code <style>} elements) and JavaScript ({@code <script>}
 * elements) are removed together with their bodies; every other tag is
 * removed while the text between tags is kept.
 *
 * @author Jae Yong Sung
 */
public class ParseHTML {
	/** Regex character class of the punctuation deleted from the input before markup is processed. */
	private static final String STRIPPED_CHARS_REGEX = "[?!$\\]\\[\\)\\(\\}\\{]";

	/**
	 * A {@code '<'} and the next {@code '>'} are treated as one tag only when
	 * they are fewer than this many characters apart. Longer spans are left
	 * in the output untouched.
	 */
	private static final int MAX_TAG_SPAN = 500;

	/**
	 * Converts an HTML fragment to plain text.
	 * <p>
	 * Processing order:
	 * <ol>
	 * <li>the characters {@code ? ! $ ] [ ) ( } {} are deleted everywhere;</li>
	 * <li>{@code <style ...>...</style>} elements are removed with their content;</li>
	 * <li>{@code <script ...>...</script>} elements are removed with their content;</li>
	 * <li>every remaining {@code <...>} span shorter than the tag-span limit is removed.</li>
	 * </ol>
	 * Element names in steps 2 and 3 are matched case-insensitively. An element
	 * whose closing tag is missing is not removed as a whole; only its opening
	 * tag is stripped by step 4. The result is trimmed of leading and trailing
	 * whitespace only when step 4 removed at least one tag.
	 *
	 * @param string the HTML text to clean; must not be {@code null}
	 * @return the text with markup removed
	 * @throws NullPointerException if {@code string} is {@code null}
	 */
	public String parseHTML(String string) {
		String input = string.replaceAll(STRIPPED_CHARS_REGEX, "");
		input = removeElements(input, "<style", "</style>");
		input = removeElements(input, "<script", "</script>");
		return removeTags(input);
	}

	/**
	 * Repeatedly removes the first span that starts at {@code openPrefix} and
	 * ends with {@code closeTag}, until no complete span is left.
	 */
	private static String removeElements(String text, String openPrefix, String closeTag) {
		String result = text;
		while (true) {
			// Restart from the beginning each round: cutting a span can join
			// the text on both sides into a new opening prefix.
			int open = indexOfIgnoreCase(result, openPrefix, 0);
			if (open == -1) {
				return result;
			}
			int close = indexOfIgnoreCase(result, closeTag, open);
			if (close == -1) {
				return result;
			}
			result = result.substring(0, open) + result.substring(close + closeTag.length());
		}
	}

	/**
	 * Removes every {@code <...>} span shorter than {@link #MAX_TAG_SPAN} in a
	 * single left-to-right pass, trimming the result if anything was removed.
	 */
	private static String removeTags(String text) {
		StringBuilder kept = new StringBuilder(text.length());
		boolean removedAny = false;
		int cursor = 0;
		while (true) {
			int open = text.indexOf('<', cursor);
			if (open == -1) {
				break;
			}
			int close = text.indexOf('>', open);
			if (close == -1) {
				break;
			}
			if (close - open < MAX_TAG_SPAN) {
				// Keep the text in front of the tag and drop the tag itself.
				kept.append(text, cursor, open);
				removedAny = true;
			} else {
				// Span too long to be treated as a tag: keep it verbatim.
				kept.append(text, cursor, close + 1);
			}
			cursor = close + 1;
		}
		kept.append(text, cursor, text.length());
		String result = kept.toString();
		return removedAny ? result.trim() : result;
	}

	/**
	 * Returns the index of the first case-insensitive occurrence of
	 * {@code needle} in {@code text} at or after {@code from}, or -1 if none.
	 */
	private static int indexOfIgnoreCase(String text, String needle, int from) {
		int lastStart = text.length() - needle.length();
		for (int i = Math.max(from, 0); i <= lastStart; i++) {
			if (text.regionMatches(true, i, needle, 0, needle.length())) {
				return i;
			}
		}
		return -1;
	}
}
